This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
##load the dataset
semester_data_1 <- read_csv("raw/semester_dummy/semester_data_1.csv")
## Rows: 6632 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): x1, x2, x3, x4, x5, x6
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
semester_data_2 <- read_csv("raw/semester_dummy/semester_data_2.csv")
## Rows: 7258 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): x2
## dbl (5): x1, x3, x4, x5, x6
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(semester_data_1)
head(semester_data_2)
##rename the column names in "semester_data_1"
semester_data_1 <- semester_data_1 |>
rename (unitid = x1, instnm = x2, semester = x3, quarter = x4, year = x5, Y = x6)
head(semester_data_1)
##convert the character into double (numeric)
##then, delete the first row of the "semester_data_1"
semester_data_1 <- semester_data_1 |>
mutate(unitid = as.numeric(unitid), semester = as.numeric(semester), quarter = as.numeric(quarter), year = as.numeric(year), Y = as.numeric(Y)) |>
slice(-1)
## Warning: There were 5 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `unitid = as.numeric(unitid)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 4 remaining warnings.
head(semester_data_1)
##rename the column names in "semester_data_2"
semester_data_2 <- semester_data_2 |>
rename (unitid = x1, instnm = x2, semester = x3, quarter = x4, year = x5, Y = x6)
head(semester_data_2)
##join the two data sets with bind_rows() because they have exactly the same columns
semester_data <- bind_rows(semester_data_1, semester_data_2)
head(semester_data)
##delete the column "Y"
semester_data <- semester_data |>
select (-Y) #|>
##mutate(instnm = toupper(instnm))
head(semester_data)
##identify the transitioned year for each university
transition_year <- semester_data |>
group_by(unitid) |>
mutate(prev_semester = lag(semester),
prev_quarter = lag(quarter)) |>
filter(prev_semester == 0 & semester == 1 & prev_quarter == 1 & quarter == 0) |>
rename(yearofsem = year) |>
select (unitid, yearofsem) #should not add "instnm" here as upper cases and lower cases are mixed
head(transition_year)
##use left_join to merge the two data sets ("semester_data" and "transition_year")
semester_dummy <- semester_data |>
left_join (transition_year, by = "unitid") #joined only by "unitid"
head(semester_dummy)
##finalizing the cleaning of the semester data
##create a dummy variable which shows 0 if semesters were yet to be introduced and 1 if already introduced
cleaned_semester_dummy <- semester_dummy |>
group_by(unitid) |>
mutate(after = if_else(year < yearofsem, "0", "1")) |>
mutate(after = as.numeric(after))
cleaned_semester_dummy
##checking the number of NAs in "cleaned_semester_dummy"
#sum(is.na(cleaned_semester_dummy)) #total number of NAs: 25688
##the semester data set provided by the organizer
#semester_by_organizer <- read_csv("intermediate_by_organizer/clean_semester_dummy.csv") |>
#select(-1)
#s#emester_by_organizer
#checking the number of NAs in "semester_by_organizer"
#sum(is.na(semester_by_organizer)) #identical with "cleaned_semester_dummy"
##save this cleaned data set "cleaned_semester_dummy" as a .csv file
##remember to add ".csv" at the end
#write_csv(cleaned_semester_dummy, "cleaned_semester_aomi.csv")
##use the intermediate data for the tentative use
##create empty tibble
df_gradrate <- tibble()
##loop through each year from 1991 to 2016
for (i in c(c(1991:1993), c(1995:2016))) { #excluding 1994
#construct the filename
filename <- paste0("raw/outcome_csv_aomi/", i, ".csv")
gradrate_data <- read_csv(filename)
df_gradrate <- bind_rows(df_gradrate, gradrate_data) #repeat binding "df_gradrate" and "gradrate_data"
}
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 731 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): women_gradrate_4yr
## dbl (8): unitid, year, totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrad...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1249 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1250 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1248 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1247 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1244 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 1241 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): unitid, year
## lgl (7): totcohortsize, w_cohortsize, m_cohortsize, tot4yrgrads, m_4yrgrads,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df_gradrate)
##calculate female's 4-year graduation rate
df_gradrate <- df_gradrate |>
mutate(women_gradrate_4yr = as.numeric(women_gradrate_4yr)) |>
mutate(womengradrate4yr = 0.01 * women_gradrate_4yr)
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `women_gradrate_4yr = as.numeric(women_gradrate_4yr)`.
## Caused by warning:
## ! NAs introduced by coercion
head(df_gradrate)
#calculate total 4-year graduation rate, then covert into numeric factors
#calculate male's 4-year graduation rate, then covert into numeric factors
df_gradrate <- df_gradrate |>
mutate(gradrate4yr = tot4yrgrads / totcohortsize,
mengradrate4yr = m_4yrgrads / m_cohortsize)
head(df_gradrate)
##use round() to control the number of digit
df_gradrate <- df_gradrate |>
mutate(gradrate4yr = round(gradrate4yr, digits = 3)) |>
mutate(mengradrate4yr = round(mengradrate4yr, digits = 3))
head(df_gradrate)
##limit the range of years in the data set
outcome_data <- df_gradrate |>
filter (year >= 1991 & year <= 2010)
head(outcome_data)
#write_csv(outcome_data, "cleaned_outcome_aomi.csv")
##load the data
covariates <- read_csv("raw/covariates_csv_aomi/covariates.csv")
## Rows: 149408 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): university_id, category
## dbl (2): year, value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(covariates)
##rename the column name from "university_id" into "unitid"
##then, create a new column with the same name while removing "aaaa" from each row
covariates <- covariates |>
rename(unitid = university_id) |>
mutate(unitid = str_remove_all(unitid, "aaaa"))
head(covariates)
##see all the unique values in the "category" column
#unique(covariates$category)
##use "pivot_wider" to transfer the dataset
covariates <- covariates |>
pivot_wider(names_from = category,
values_from = value)
head(covariates)
##recall "cleaned_semester_dummy"
#cleaned_semester_dummy
##recall "outcome_data"
#outcome_data
##see all the unique values existing in the "year" column of "cleaned_semester_dummy" data
unique(cleaned_semester_dummy$year)
## [1] 1991 1992 1993 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## [16] 2007 2008 2009 2010
##see all the unique values existing in the "year" column of "outcome_data"
unique(outcome_data$year)
## [1] 1991 1992 1993 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006
## [16] 2007 2008 2009 2010
##limit the year duration for "covariates" data
covariates <- covariates |>
group_by(unitid) |>
filter (year >= 1991 & year != 1994 & year <= 2010)
head(covariates)
##see all the unique values existing in the "unitid" column in the "outcome_data" data
#unique(outcome_data$unitid)
##see all the unique values existing in the "unitid" column in the "covariates" data
#unique(covariates$unitid)
##filter the "unitid" column to keep only the rows that have the same value in "outcome_data" data set.
#pipe operators cannot be used within functions
#"unitid == outcome_data$unitid": This checks if unitid is equal to the entire vector outcome$unitid, which is not what you want if outcome$unitid contains multiple values. It would result in a logical vector of TRUE/FALSE that doesn't directly help in filtering rows.
#"unitid %in% outcome_data$unitid": This checks if each value in unitid is present in the outcome$unitid vector. It creates a logical vector where each element is TRUE if the corresponding unitid value is found in outcome$unitid and FALSE otherwise. This is useful for filtering rows based on the presence of values in a list or vector.
covariates_data <- covariates |>
filter(unitid %in% (outcome_data$unitid)) |>
#convert characters into double ("unitid" column)
mutate(unitid = as.numeric(unitid))
head(covariates_data)
#write_csv(covariates_data, "cleaned_covariates_aomi.csv")
##Semester Data: cleaned_semester_dummy
##Gradrate Data: outcome_data
##Covariates Data: covariates_data
cleaned_semester_dummy
outcome_data
covariates_data
##combine three data sets by "unitid (double)"
pre_master_data <- cleaned_semester_dummy |>
left_join(outcome_data, by = c("unitid", "year"))
pre_master_data
master_data <- pre_master_data |>
left_join(covariates_data, by = c("unitid", "year")) |>
#calculate the ratio of women cohorts and white cohorts, respectively
mutate(per_white_cohort = white_cohortsize / totcohortsize,
per_women_cohort = w_cohortsize / totcohortsize)
master_data
### Compare my cleaned data and data by organizer (covariates data) : identical!
##mine
#sum(is.na(covariates_data)) #total number of NAs: 0
##by organizer
#covariates_by_organizer <- read_csv("intermediate_by_organizer/clean_covariates.csv")
#sum(is.na(covariates_by_organizer)) #total number of NAs: 0
#write_csv(master_data, "master_data_aomi.csv")
##count the total number of NAs in "master_data"
master_data
#sum(is.na(master_data)) #total number of NAs: 25801
##load the cleaned master data provided by the organizers
#Master <- read_csv("intermediate_by_organizer/master.csv")
##count the number of NAs in each column in "master_data"
master_data |>
apply(2, function(x) sum(is.na(x))) #1 for row, 2 for column
## unitid instnm semester quarter
## 0 0 0 0
## year yearofsem after totcohortsize
## 0 12844 12844 0
## w_cohortsize m_cohortsize tot4yrgrads m_4yrgrads
## 0 0 0 0
## w_4yrgrads women_gradrate_4yr womengradrate4yr gradrate4yr
## 0 24 24 0
## mengradrate4yr instatetuition costs faculty
## 65 0 0 0
## white_cohortsize per_white_cohort per_women_cohort
## 0 0 0
##switchers: TRUE / never-switchers: FALSE
#abs(): absolute value
transition <- master_data |>
group_by(unitid) |>
mutate(semester_change = abs(semester - lag(semester, default = first(semester))),
quarter_change = abs(quarter - lag(quarter, default = first(quarter))),
switching = (semester_change + quarter_change) > 0) |>
filter(switching == TRUE) |>
select(unitid, switching)
##table: dataset containing the column called "switching"
table <- master_data |>
left_join(transition, by = "unitid") |>
mutate(switching = if_else(is.na(switching),"Never-Switchers", "Switchers"))
table
#library(table1)
library(ggplot2)
avg_gradrate4yr <- master_data |>
group_by(year) |>
mutate(avg_gradrate4yr = mean(gradrate4yr))
avg_gradrate4yr |>
ggplot(mapping = aes(x = year,
y = avg_gradrate4yr)) +
geom_line() +
xlim(1990, 2010) + #set the minimum and maximum limits on the x-axis
ylim(0.25, 0.45) +
labs(x = "Year",
y = "4-year Graduation Rate")
semesters_share <- master_data |>
group_by(year) |>
mutate(semesters_share = sum(semester) / n())
semesters_share |>
ggplot(mapping = aes(x = year,
y = semesters_share)) +
geom_line() +
xlim(1990, 2010) +
ylim(0.8, 1.0) +
labs (x = "Year",
y = "Fractions of Schools on Semesters")
###Scatter Plot
#colors()
##if creating the scatter plots manually:
#master_data |>
#ggplot(mapping = aes(x = per_women_cohort, y = gradrate4yr)) +
#geom_point(color = "skyblue", alpha = 0.5)
#master_data |>
#ggplot(mapping = aes(x = per_white_cohort, y = gradrate4yr)) +
#geom_point(color = "violet", alpha = 0.5)
#master_data |>
#ggplot(mapping = aes(x = instatetuition, y = gradrate4yr)) +
#geom_point(color = "springgreen", alpha = 0.5)
##if using rlang package
library(rlang)
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
library(ggplot2)
plot_gradrate <- function(data, xvar, color_value, xaxis) {
xvar <- enquo(xvar) #quosing
ggplot(data, mapping = aes(x = !!xvar, y = gradrate4yr)) + #dynamic
geom_point(color = color_value, alpha = 0.5) + #static
labs(x = xaxis, #static
y = "4-year Graduation Rate")
}
plot_gradrate(master_data, per_women_cohort, "skyblue", "Female Student Shares")
plot_gradrate(master_data, per_white_cohort, "violet", "White Student Shares")
plot_gradrate(master_data, instatetuition, "springgreen", "Tuition")
### Regression Analysis
##Independent Variable: semester
##Dependent Variable: gradrate4yr
library(modelsummary)
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
##run linear regression
model <- lm(gradrate4yr ~ semester, data = master_data)
summary(model)
##
## Call:
## lm(formula = gradrate4yr ~ semester, data = master_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.37209 -0.18809 -0.03809 0.16391 0.58991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.336755 0.007295 46.162 < 2e-16 ***
## semester 0.036336 0.007560 4.806 1.55e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2257 on 13887 degrees of freedom
## Multiple R-squared: 0.001661, Adjusted R-squared: 0.001589
## F-statistic: 23.1 on 1 and 13887 DF, p-value: 1.555e-06
##controling per_women_cohort, per_white_cohort, instatetuition
#model_2 <- lm(gradrate4yr ~ semester + per_women_cohort + per_white_cohort + instatetuition, data = master_data)
#summary(model_2)
##create and display tables
modelsummary(model)
| (1) | |
|---|---|
| (Intercept) | 0.337 |
| (0.007) | |
| semester | 0.036 |
| (0.008) | |
| Num.Obs. | 13889 |
| R2 | 0.002 |
| R2 Adj. | 0.002 |
| AIC | -1932.3 |
| BIC | -1909.7 |
| Log.Lik. | 969.166 |
| RMSE | 0.23 |